In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
import theano.tensor as tt

from theano.compile.ops import as_op
from scipy.stats import norm
from matplotlib import gridspec
from matplotlib.patches import Rectangle
from IPython.display import Image
import seaborn as sns
import pymc3 as pm
import numpy as np
from scipy.stats import norm
import pandas as pd

import theano.tensor as T
from theano.compile.ops import as_op

%matplotlib notebook
plt.style.use('seaborn-white')

color = '#87ceeb'

f_dict = {'size':14}
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
In [2]:
df2 = pd.read_csv('../data/pseudonymized-data.csv', dtype={'Community':'category',
                                  'V27':'category',
                                  'V28':'category',
                                  'V29':'category',
                                  'V30':'category',
                                  'V31':'category',
                                  'V32':'category',
                                  'V33':'category'})
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387 entries, 0 to 386
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Community  387 non-null    category
 1   V27        387 non-null    category
 2   V28        387 non-null    category
 3   V29        387 non-null    category
 4   V30        387 non-null    category
 5   V31        387 non-null    category
 6   V32        387 non-null    category
 7   V33        387 non-null    category
dtypes: category(8)
memory usage: 5.3 KB
In [3]:
# reduce data values by 1 so that they become indexes for the cdf array below.
# the .cat.codes function from pandas doesn't work because it assings codes depending on the different values of the data
def from_data_to_code(x):
    return int(x)-1
def codes(self):
    return np.array([from_data_to_code(x) for x in self])
In [4]:
# sns.countplot(x=df2[df2.Community=='Comm.Linux_Kernel'].Community,hue=df2[df2.Community=='Comm.Linux_Kernel'].V27)
# codes(df2[df2.V28!='-1'].V28)
# df2.Community.cat.codes.values
# df2[df2.Community=='Comm.Linux_Kernel'].Community
df2.Community.cat.codes.values.shape
Out[4]:
(387,)
In [8]:
 
thresh2:	[1.5 2.5 3.5 4.5]
thresh_obs2:	[1.5 -- -- 4.5]
In [194]:
def infer(q):
    # Number of outcomes
    nYlevels2 = 5 # df2.V28.cat.categories.size
    # Number of groups
    n_grps = df2.Community.nunique()
    # Group index
    grp_idx = df2[df2[q]!='-1'].Community.cat.codes.values

    thresh2 = np.arange(1.5, nYlevels2, dtype=np.float64)
    thresh_obs2 = np.ma.asarray(thresh2)
    thresh_obs2[1:-1] = np.ma.masked

    print('thresh2:\t{}'.format(thresh2))
    print('thresh_obs2:\t{}'.format(thresh_obs2))


    @as_op(itypes=[tt.dvector, tt.dvector, tt.dvector], otypes=[tt.dmatrix])
    def outcome_probabilities(theta, mu, sigma):
        out = np.empty((nYlevels2, n_grps), dtype=np.float64)
        n = norm(loc=mu, scale=sigma)       
        out[0,:] = n.cdf(theta[0])        
        out[1,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[1]) - n.cdf(theta[0])], axis=0)
        out[2,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[2]) - n.cdf(theta[1])], axis=0)
        out[3,:] = np.max([np.repeat(0,n_grps), n.cdf(theta[3]) - n.cdf(theta[2])], axis=0)
        out[4,:] = 1 - n.cdf(theta[3])
        return out

    with pm.Model() as ordinal_model_multi_groups:    
        
        theta = pm.Normal('theta', mu=thresh2, tau=np.repeat(.5**2, len(thresh2)),
                          shape=len(thresh2), observed=thresh_obs2)
        
        mu = pm.Normal('mu', mu=nYlevels2/2.0, tau=1.0/(nYlevels2**2), shape=n_grps)
        sigma = pm.Uniform('sigma', nYlevels2/1000.0, nYlevels2*10.0, shape=n_grps)
        
        pr = outcome_probabilities(theta, mu, sigma)
        
        y = pm.Categorical('y', pr[:,grp_idx].T, observed=codes(df2[df2[q]!='-1'][q]))    #df2.V28.cat.codes.to_numpy())
    
        trace2 = pm.sample(5000, tune=1000, cores=4)
    
        return trace2

t=infer('V33')
thresh2:	[1.5 2.5 3.5 4.5]
thresh_obs2:	[1.5 -- -- 4.5]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS failed. Falling back to elementwise auto-assignment.
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Slice: [sigma]
>Slice: [mu]
>Slice: [theta_missing]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 2582 seconds.
The number of effective samples is smaller than 25% for some parameters.
In [10]:
with ordinal_model_multi_groups:
    trace2 = pm.sample(5000, tune=1000, cores=4)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS failed. Falling back to elementwise auto-assignment.
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Slice: [sigma]
>Slice: [mu]
>Slice: [theta_missing]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 2599 seconds.
The number of effective samples is smaller than 25% for some parameters.
In [222]:
## Mean density plots of answers to variable v for all communities
def mean_densities(t,v):
    plt.figure(figsize=(15,5))
    NUM_COLORS = 16
    cmap = plt.get_cmap('tab20')
    colors = [cmap(i) for i in np.linspace(0, 1, NUM_COLORS)]
    grp_idxs_colors_grp_names = zip(df2.Community.cat.codes.unique(),colors,df2.Community.unique())
    [pm.plot_kde(t['mu'][:,i], 
                 plot_kwargs={'color': c}, cumulative=False, 
                 label=n) 
      for (i,c,n) in grp_idxs_colors_grp_names]
    plt.xlim([0,7])
    plt.legend(loc='best')
    plt.suptitle('Mean density plots for '+v)
    plt.show()
In [228]:
# mean_densities(t,'V33')
pm.plot_kde(t['mu'][:,4])
plt.xlim([0,7])
plt.suptitle('DuckDuckGo question V33')
Out[228]:
Text(0.5, 0.98, 'DuckDuckGo question V33')
2020-09-18T13:02:31.430377 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
In [196]:
pm.summary(t)
Out[196]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
theta_missing[0] 2.686 0.122 2.464 2.921 0.002 0.002 3144.0 3144.0 3143.0 6126.0 1.0
theta_missing[1] 3.500 0.141 3.237 3.763 0.003 0.002 3112.0 3112.0 3114.0 6132.0 1.0
mu[0] 2.318 0.297 1.750 2.864 0.002 0.002 16336.0 16102.0 16311.0 14231.0 1.0
mu[1] 2.563 2.487 -2.069 8.362 0.025 0.025 9952.0 4811.0 11684.0 6688.0 1.0
mu[2] 2.421 0.279 1.888 2.941 0.003 0.002 11239.0 10767.0 11142.0 10852.0 1.0
mu[3] 1.762 0.467 1.010 2.541 0.007 0.006 3882.0 2647.0 8323.0 4769.0 1.0
mu[4] 3.020 5.034 -6.612 12.280 0.042 0.037 14542.0 9067.0 14617.0 11702.0 1.0
mu[5] 2.103 0.241 1.635 2.548 0.002 0.002 10534.0 10223.0 10472.0 10459.0 1.0
mu[6] 2.988 0.250 2.529 3.461 0.003 0.002 7177.0 7032.0 7229.0 9098.0 1.0
mu[7] 2.410 3.465 -4.313 9.490 0.032 0.032 11505.0 6037.0 12448.0 8636.0 1.0
mu[8] 2.395 0.327 1.791 2.921 0.005 0.005 3900.0 2591.0 6467.0 5800.0 1.0
mu[9] 2.827 0.405 2.030 3.555 0.004 0.003 11099.0 10210.0 11584.0 9074.0 1.0
mu[10] 1.003 4.162 -6.829 9.410 0.038 0.034 11818.0 7341.0 12373.0 10003.0 1.0
mu[11] 2.827 0.322 2.202 3.424 0.003 0.002 10068.0 9760.0 10113.0 10534.0 1.0
mu[12] 2.568 0.355 1.889 3.237 0.003 0.002 12321.0 11178.0 12505.0 9945.0 1.0
mu[13] 3.024 0.196 2.664 3.399 0.003 0.002 5300.0 5282.0 5299.0 8925.0 1.0
mu[14] 1.798 0.856 0.064 3.290 0.009 0.006 9573.0 8825.0 10430.0 8770.0 1.0
mu[15] 2.909 1.766 0.340 6.686 0.042 0.034 1769.0 1319.0 4059.0 1836.0 1.0
sigma[0] 1.179 0.283 0.700 1.705 0.003 0.002 11453.0 11123.0 12171.0 12324.0 1.0
sigma[1] 5.459 8.470 0.005 22.099 0.132 0.093 4106.0 4106.0 3847.0 6822.0 1.0
sigma[2] 1.189 0.264 0.753 1.679 0.003 0.002 9989.0 9355.0 11174.0 9736.0 1.0
sigma[3] 0.839 0.935 0.053 1.874 0.017 0.012 2871.0 2871.0 5329.0 4475.0 1.0
sigma[4] 26.839 14.285 4.077 49.998 0.116 0.088 15227.0 13078.0 14158.0 9135.0 1.0
sigma[5] 1.142 0.243 0.744 1.603 0.003 0.002 6840.0 6840.0 6984.0 10388.0 1.0
sigma[6] 0.923 0.210 0.560 1.312 0.002 0.002 9446.0 8771.0 10621.0 9553.0 1.0
sigma[7] 11.585 11.169 0.679 35.513 0.152 0.107 5407.0 5407.0 6000.0 6950.0 1.0
sigma[8] 0.540 0.400 0.044 1.122 0.007 0.005 3672.0 3378.0 7730.0 6721.0 1.0
sigma[9] 1.194 0.403 0.604 1.912 0.005 0.003 7601.0 7348.0 8954.0 8514.0 1.0
sigma[10] 17.990 14.050 0.204 44.491 0.159 0.114 7855.0 7641.0 7994.0 8534.0 1.0
sigma[11] 1.223 0.299 0.747 1.780 0.003 0.002 9019.0 8369.0 10750.0 10051.0 1.0
sigma[12] 1.216 0.350 0.674 1.867 0.004 0.003 7959.0 7134.0 9447.0 8703.0 1.0
sigma[13] 0.762 0.154 0.493 1.059 0.001 0.001 11400.0 10795.0 12100.0 11367.0 1.0
sigma[14] 2.264 1.260 0.738 4.387 0.019 0.013 4617.0 4423.0 6285.0 6443.0 1.0
sigma[15] 2.666 4.789 0.020 8.488 0.117 0.083 1668.0 1668.0 2562.0 2362.0 1.0
In [207]:
plt.figure(figsize=(30,10))
sns.countplot(x=df2[df2.V33!='-1'].Community,hue=df2[df2.V33!='-1'].V33)
Out[207]:
<AxesSubplot:xlabel='Community', ylabel='count'>
2020-09-18T11:26:35.053472 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
In [ ]:
mu2 = trace2['mu']
sigma2 = trace2['sigma']

# Concatenate the fixed thresholds into the estimated thresholds
n = trace2['theta_missing'].shape[0]
thresholds2 = np.c_[np.tile([1.5], (n,1)),
                    trace2['theta_missing'],
                    np.tile([4.5], (n,1))]

fig, axes = plt.subplots(5,2, figsize=(10,14))
ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9,ax10 = axes.flatten() 

# Mu
pm.plot_posterior(mu2[:,0], point_estimate='mode', color=color, ax=ax1)
ax1.set_xlabel('$\mu_{1}$', fontdict=f_dict)
pm.plot_posterior(mu2[:,1], point_estimate='mode', color=color, ax=ax3)
ax3.set_xlabel('$\mu_{2}$', fontdict=f_dict)
for title, ax in zip(['Comm.Coala Mean', 'Comm.Linux_Kernel Mean'], [ax1, ax3]):
    ax.set_title(title, fontdict=f_dict)

# Sigma
pm.plot_posterior(sigma2[:,0], point_estimate='mode', color=color, ax=ax5)
ax5.set_xlabel('$\sigma_{1}$', fontdict=f_dict)
pm.plot_posterior(sigma2[:,1], point_estimate='mode', color=color, ax=ax7)
ax7.set_xlabel('$\sigma_{2}$', fontdict=f_dict)
for title, ax in zip(['Comm.Coala Std. Dev.', 'Comm.Linux_Kernel Std. Dev.'], [ax5, ax7]):
    ax.set_title(title, fontdict=f_dict)

# Posterior distribution on the thresholds
ax9.scatter(thresholds2, np.tile(thresholds2.mean(axis=1).reshape(-1,1), (1,4)), color=color, alpha=.6, facecolor='none')
ax9.set_ylabel('Mean Threshold', fontdict=f_dict)
ax9.set_xlabel('Threshold', fontdict=f_dict)
ax9.vlines(x = thresholds2.mean(axis=0),
           ymin=thresholds2.mean(axis=1).min(),
           ymax=thresholds2.mean(axis=1).max(), linestyles='dotted', colors=color)

# Posterior predictive probabilities of the outcomes
threshCumProb2A = np.empty(thresholds2.shape)
for i in np.arange(threshCumProb2A.shape[0]):
    threshCumProb2A[i] = norm().cdf((thresholds2[i] - mu2[i,0])/sigma2[i,0])    
outProb2A = (np.c_[threshCumProb2A, np.tile(1, (thresholds2.shape[0],1))]
           - np.c_[np.tile(0, (thresholds2.shape[0],1)), threshCumProb2A])
yerr2A = np.abs(np.subtract(pm.hpd(outProb2A), outProb2A.mean(axis=0).reshape(-1,1)))

ax2.errorbar(x = np.arange(outProb2A.shape[1]), y=outProb2A.mean(axis=0),
             yerr=yerr2A.T, color=color, fmt='o')

threshCumProb2B = np.empty(thresholds2.shape)
for i in np.arange(threshCumProb2B.shape[0]):
    threshCumProb2B[i] = norm().cdf((thresholds2[i] - mu2[i,1])/sigma2[i,1])    
outProb2B = (np.c_[threshCumProb2B, np.tile(1, (thresholds2.shape[0],1))]
           - np.c_[np.tile(0, (thresholds2.shape[0],1)), threshCumProb2B])
yerr2B = np.abs(np.subtract(pm.hpd(outProb2B), outProb2B.mean(axis=0).reshape(-1,1)))

ax4.errorbar(x = np.arange(outProb2B.shape[1]), y=outProb2B.mean(axis=0),
             yerr=yerr2B.T, color=color, fmt='o')

for grp, ax in zip(['Comm.Coala', 'Comm.Linux_Kernel'], [ax2, ax4]):
    ((df2[df2.Community == grp].V28.value_counts().sort_index()/df2[df2.Community == grp].V28.size)
     .plot.bar(ax=ax, rot=0, color='royalblue'))
    ax.set_title('Data for {0} with Post. Pred.\nN = {1}'.format(grp, df2[df2.Community == grp].V28.size), fontdict=f_dict)
    ax.set_xlabel('y')
    sns.despine(ax=ax, left=True)
    ax.yaxis.set_visible(False)

# Mu diff
pm.plot_posterior(mu2[:,1]-mu2[:,0], point_estimate='mode', color=color, ax=ax6)
ax6.set_xlabel('$\mu_{2}-\mu_{1}$', fontdict=f_dict)
# Sigma diff
pm.plot_posterior(sigma2[:,1]-sigma2[:,0], point_estimate='mode', color=color, ax=ax8)
ax8.set_xlabel('$\sigma_{2}-\sigma_{1}$', fontdict=f_dict)
# Effect size
pm.plot_posterior((mu2[:,1]-mu2[:,0]) / np.sqrt((sigma2[:,0]**2+sigma2[:,1]**2)/2), point_estimate='mode', color=color, ax=ax10)
ax10.set_xlabel(r'$\frac{(\mu_2-\mu_1)}{\sqrt{(\sigma_1^2+\sigma_2^2)/2}}$', fontdict=f_dict)
for title, ax in zip(['Differences of Means', 'Difference of Std. Dev\'s', 'Effect Size'], [ax6, ax8, ax10]):
    ax.set_title(title, fontdict=f_dict)

fig.suptitle('V28: I do not consider a pull request/patch, unless I trust the contributor.', fontsize=16)
fig.tight_layout()
In [254]:
def count_per_data_value(d,c,q):
    return np.array([ len(d[(d[q] == x) & (d.Community==c)][q]) for x in ['1','2','3','4','5']])
In [289]:
def comm_to_code(d,c):
    return [code for (comm,code) in zip(d.Community.unique(),d.Community.cat.codes.unique()) if comm==c][0]
In [295]:
comm_to_code(df2,'-1')
Out[295]:
0
In [298]:
def plot_ppc_normal_density(d,t,c,q):
    c_code=comm_to_code(d,c)
    x=np.arange(-5,10,step=.01)
    s=pd.Series((count_per_data_value(df2,c,q)/count_per_data_value(df2,c,q).sum()))
    # s.index=s.index+1
    s.plot.bar(rot=0, color='royalblue')
    plt.suptitle('Data of '+c+' question '+q)
    [plt.plot(x,norm(loc=m,scale=s).pdf(x),color='lightblue',alpha=.05) 
        for (m,s) in zip(t['mu'][18000:,c_code],t['sigma'][18000:,  c_code])]
    plt.plot(x,norm(loc=np.mean(t['mu'][:,c_code]),scale=np.mean(t['sigma'][:,c_code])).pdf(x),color='blue',linewidth=2)
    plt.xlim([-1,7])
    plt.show()
In [301]:
[plot_ppc_normal_density(df2,t,c,'V33') for c in df2.Community.unique()]
2020-09-18T14:27:46.710895 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:27:52.117502 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:27:57.446698 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:03.431838 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:08.411460 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:14.980724 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:20.448478 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:26.728312 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:31.974926 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:37.375092 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:44.582321 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:49.945971 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:28:56.757047 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:29:02.279716 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:29:07.788867 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
2020-09-18T14:29:14.323878 image/svg+xml Matplotlib v3.3.1, https://matplotlib.org/
Out[301]:
[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]
In [ ]:
pm.summary(trace2)
In [ ]:
# plt.subplot(211)
x_axis = np.arange(-10, 15, 0.01)
(((df2[df2.Community == 'Comm.Linux_Kernel'].V28.value_counts().sort_index()*.4)/df2[df2.Community == 'Comm.Linux_Kernel'].V28.size)
     .plot.bar(rot=0, color='purple'))
[plt.plot(x_axis, norm.pdf(x_axis,m,s),color='royalblue',alpha=.05) for (m,s) in zip(trace2['mu'][18000:,1],trace2['sigma'][18500:,1])]
plt.plot(x_axis, norm.pdf(x_axis,np.mean(trace2['mu'][:,1]),np.mean(trace2['sigma'][:,1])),color='black',linewidth=2)
plt.show()

# plt.subplot(212)
(((df2[df2.Community == 'Comm.Coala'].V28.value_counts().sort_index()*.4)/df2[df2.Community == 'Comm.Coala'].V28.size)
     .plot.bar(rot=0, color='purple'))
[plt.plot(x_axis, norm.pdf(x_axis,m,s),color='royalblue',alpha=.05) for (m,s) in zip(trace2['mu'][18000:,0],trace2['sigma'][18500:,0])]
plt.plot(x_axis, norm.pdf(x_axis,np.mean(trace2['mu'][:,0]),np.mean(trace2['sigma'][:,0])),color='black',linewidth=2)
plt.show()